diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..b003b49 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,43 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.11"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # Needed for hatch-vcs to determine version + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + version: "latest" + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --all-groups + + - name: Run tests with pytest + run: uv run pytest -v --cov=xarray_ome --cov-report=xml --cov-report=term + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.11' + uses: codecov/codecov-action@v5 + with: + file: ./coverage.xml + fail_ci_if_error: false diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..c48a428 --- /dev/null +++ b/TODO.md @@ -0,0 +1,62 @@ +# TODO List + +## High Priority + +### Zarr Backend Fallback + +Currently, when `xr.open_dataset(..., engine="ome-zarr")` is called on a regular (non-OME-NGFF) zarr file, it will raise an error. We should detect this case and automatically fall back to xarray's native zarr backend. + +**Implementation notes:** + +- The backend's `open_dataset()` and `open_datatree()` already check `_detect_store_type()` +- They try to fall back when `store_type == "unknown"` +- However, the fallback currently calls `open_ome_dataset()` which also checks and raises ValueError +- Need to either: + 1. Catch ValueError in backend and retry with zarr engine + 2. Pass a flag to `open_ome_dataset()` to skip the check + 3. 
Restructure so backend does fallback before calling `open_ome_dataset()` + +**Related code:** + +- `xarray_ome/backend.py`: Backend entry point with fallback logic +- `xarray_ome/reader.py`: `open_ome_dataset()` and `open_ome_datatree()` with detection +- `xarray_ome/_store_utils.py`: `_detect_store_type()` function + +## Medium Priority + +### Add working code example at docs homepage start + +The docs homepage should have a concrete working example right at the start that users can copy-paste and run immediately. + +### HCS Plate Structure Support + +Add support for High Content Screening (HCS) plate/well structures from OME-NGFF. + +**References:** + +- [OME-NGFF specification: HCS layout](https://ngff.openmicroscopy.org/latest/#hcs-layout) + +### Performance Optimizations + +- Benchmark and optimize coordinate transformation +- Consider caching metadata parsing +- Profile large file opening + +### Integration with Visualization Tools + +- Examples with matplotlib/napari +- Helper functions for common viz patterns + +## Low Priority + +### Additional Metadata Validation Options + +- More granular validation controls +- Custom validation rules +- Better error messages for invalid metadata + +### Time Label Support + +Infrastructure is ready in `transforms_to_coords()` with `time_labels` parameter, but OME-NGFF spec doesn't currently define standard time labels location (unlike channel labels in `omero.channels[].label`). + +Wait for spec extension before implementing. diff --git a/docs/api.md b/docs/api.md index 5d6a65b..754adf5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -245,59 +245,98 @@ write_ome_datatree(dt, "modified.ome.zarr") ## Metadata Attributes -### DataTree Attributes +### Overview -When using `open_ome_datatree()`, the root node contains: +OME-NGFF metadata is broken out into separate, readable attributes for easy access. The complete metadata dict is also preserved for round-tripping. 
-- **ome_ngff_metadata** : `dict` +### Common Attributes (DataTree and Dataset) - Complete OME-NGFF metadata including: - - `axes`: Axis definitions (name, type, unit) - - `datasets`: Dataset paths and coordinate transformations - - `version`: OME-NGFF specification version - - `name`: Image name - - Additional metadata fields +These attributes are present in both DataTree root nodes and Datasets: -### Dataset Attributes +- **ome_name** : `str` -Each Dataset (from `open_ome_dataset()` or DataTree child nodes) contains: + Image name from OME-NGFF metadata. + Example: `'image'` -- **ome_scale** : `dict[str, float]` +- **ome_version** : `str` - Scale factors for each dimension. Maps dimension names to scale values. - Example: `{'c': 1.0, 'z': 0.5, 'y': 0.36, 'x': 0.36}` + OME-NGFF specification version. + Example: `'0.4'` -- **ome_translation** : `dict[str, float]` +- **ome_axes** : `list[str]` - Translation offsets for each dimension. Maps dimension names to offset values. - Example: `{'c': 0.0, 'z': 0.0, 'y': 0.0, 'x': 0.0}` + Axis names in order. + Example: `['c', 'z', 'y', 'x']` + +- **ome_axes_types** : `list[str | None]` -- **ome_axes_units** : `dict[str, str | None]` + Axis types (e.g., 'channel', 'space', 'time'). + Example: `['channel', 'space', 'space', 'space']` - Physical units for each dimension. - Example: `{'c': None, 'z': 'micrometer', 'y': 'micrometer', 'x': 'micrometer'}` +- **ome_axes_units** : `dict[str, str]` (optional) + + Physical units for axes that have them. + Example: `{'z': 'micrometer', 'y': 'micrometer', 'x': 'micrometer'}` - **ome_axes_orientations** : `dict[str, str]` (optional) Anatomical orientation for spatial axes (if RFC-4 metadata present). -- **ome_ngff_resolution** : `int` +- **ome_num_resolutions** : `int` - The resolution level index (only in Datasets from `open_ome_dataset()`). + Number of resolution levels in the multiscale pyramid. 
+ Example: `3` + +- **ome_multiscale_paths** : `list[str]` + + Paths to each resolution level. + Example: `['0', '1', '2']` + +- **ome_channels** : `list[str]` (optional) + + Channel labels from `omero.channels[].label`. + Not added by `metadata_to_xarray_attrs()`, so it is absent from DataTree root attrs; the labels are exposed as the values of the `c` coordinate. + Example: `['LaminB1', 'Dapi']` + +- **ome_channel_colors** : `list[str]` (optional) + + Channel colors (hex RGB) from `omero.channels[].color`. + Example: `['0000FF', 'FFFF00']` + +- **ome_channel_windows** : `list[dict]` (optional) + + Rendering window settings for each channel from `omero.channels[].window`. + Example: `[{'min': 0.0, 'max': 65535.0, 'start': 0.0, 'end': 1500.0}, ...]` - **ome_ngff_metadata** : `dict` - Full OME-NGFF metadata (only in Datasets from `open_ome_dataset()`). + Complete OME-NGFF metadata for round-tripping. Contains all metadata + fields in their original nested structure. + +### Dataset-Only Attributes + +Datasets also contain coordinate transformation information: + +- **ome_scale** : `dict[str, float]` + + Scale factors for each dimension. Maps dimension names to scale values. + Example: `{'c': 1.0, 'z': 0.5, 'y': 0.36, 'x': 0.36}` + +- **ome_translation** : `dict[str, float]` + + Translation offsets for each dimension. Maps dimension names to offset values. + Example: `{'c': 0.0, 'z': 0.0, 'y': 0.0, 'x': 0.0}` + +- **ome_ngff_resolution** : `int` + + The resolution level index (only in Datasets from `open_ome_dataset()`). - **ome_image_name** : `str` (optional) - Image name from OME-NGFF metadata, if present. + Image name from OME-NGFF metadata (duplicates `ome_name` for backward compatibility). - **ome_channel_labels** : `list[str]` (optional) - Channel labels extracted from `omero.channels[].label` metadata. - Used as coordinate values for the channel dimension when available. - Example: `['LaminB1', 'Dapi']` + Channel labels (duplicates `ome_channels` for backward compatibility). 
### Coordinate Labels diff --git a/docs/index.md b/docs/index.md index b5cc61b..9a56885 100644 --- a/docs/index.md +++ b/docs/index.md @@ -139,6 +139,7 @@ graph LR :maxdepth: 2 usage +metadata_mapping examples api contributing diff --git a/docs/metadata_mapping.md b/docs/metadata_mapping.md new file mode 100644 index 0000000..74a5a06 --- /dev/null +++ b/docs/metadata_mapping.md @@ -0,0 +1,283 @@ +# OME-NGFF to xarray Mapping + +This page documents how each component of the OME-NGFF metadata specification is mapped to xarray data structures. + +## Design Philosophy + +**Core Principle**: Metadata that can be represented in xarray's native data model (coordinates, dimension names) is stored there, not duplicated in attrs. + +- **xarray coordinates**: Represent actual data (physical positions, channel labels) +- **xarray dimensions**: Represent axis names +- **xarray attrs**: Store metadata that has no native xarray representation + +This design ensures: + +- Natural xarray workflows (`.sel()`, `.isel()`, coordinate-based indexing) +- No redundancy between coords and attrs +- Full round-trip fidelity via preserved metadata dict + +--- + +## Metadata Mapping Reference + +### axes + +**OME-NGFF Spec**: [§2.1 Axes](https://ngff.openmicroscopy.org/0.5/#axes-md) + +OME-NGFF axes metadata describes the dimensions of the array. 
+ +| OME-NGFF Field | xarray Location | Notes | +|----------------|-----------------|-------| +| `axes[].name` | **Dataset.dims** | Axis names become dimension names (e.g., `['c', 'z', 'y', 'x']`) | +| `axes[].type` | `attrs['ome_axes_types']` | List of types (e.g., `['channel', 'space', 'space', 'space']`) | +| `axes[].unit` | `attrs['ome_axes_units']` | Dict mapping axis name to unit (e.g., `{'z': 'micrometer'}`) | + +**Example:** + +```python +ds = xr.open_dataset("image.ome.zarr", engine="ome-zarr") + +# Axis names → dimensions +print(ds.dims) # {'c': 2, 'z': 236, 'y': 275, 'x': 271} + +# Axis types → attrs +print(ds.attrs['ome_axes_types']) # ['channel', 'space', 'space', 'space'] + +# Axis units → attrs +print(ds.attrs['ome_axes_units']) # {'z': 'micrometer', 'y': 'micrometer', 'x': 'micrometer'} +``` + +--- + +### coordinateTransformations + +**OME-NGFF Spec**: [§2.3 Coordinate Transformations](https://ngff.openmicroscopy.org/0.5/#trafo-md) + +Coordinate transformations define the mapping from array indices to physical coordinates. 
+ +| OME-NGFF Field | xarray Location | Notes | +|----------------|-----------------|-------| +| `scale` transformation | **Dataset.coords** | Converted to coordinate arrays via `translation + scale * arange(size)` | +| `translation` transformation | **Dataset.coords** | Offset applied to coordinate arrays | +| Original values | `attrs['ome_scale']`, `attrs['ome_translation']` | Preserved for efficient round-tripping | + +**Example:** + +```python +# OME-NGFF metadata: +# scale = {'z': 0.5, 'y': 0.36, 'x': 0.36} +# translation = {'z': 0.0, 'y': 0.0, 'x': 0.0} + +ds = xr.open_dataset("image.ome.zarr", engine="ome-zarr") + +# Coordinates derived from transforms +print(ds.coords['z'].values[:3]) # [0.0, 0.5, 1.0] (0 + 0.5 * [0,1,2,...]) +print(ds.coords['y'].values[:3]) # [0.0, 0.36, 0.72] + +# Original transforms preserved +print(ds.attrs['ome_scale']) # {'z': 0.5, 'y': 0.36, 'x': 0.36} +``` + +**Round-trip:** + +When writing, `coords_to_transforms()` extracts scale and translation from coordinates, or uses stored values for exact fidelity. + +--- + +### multiscales + +**OME-NGFF Spec**: [§2.4 Multiscales](https://ngff.openmicroscopy.org/0.5/#multiscale-md) + +Multiscales metadata describes the image pyramid structure. 
+ +| OME-NGFF Field | xarray Location | Notes | +|----------------|-----------------|-------| +| `name` | `attrs['ome_name']` | Image identifier | +| `version` | `attrs['ome_version']` | OME-NGFF spec version | +| `type` | Not currently mapped | Downscaling method | +| `metadata` | Not currently mapped | Additional downscaling info | +| `datasets[].path` | `attrs['ome_multiscale_paths']` | List of resolution paths (e.g., `['0', '1', '2']`) | +| Number of datasets | `attrs['ome_num_resolutions']` | Count of resolution levels | +| `coordinateTransformations` | **Dataset.coords** (per dataset) | Applied per resolution level | + +**Example:** + +```python +dt = xr.open_datatree("image.ome.zarr", engine="ome-zarr") + +# Multiscale info in root attrs +print(dt.attrs['ome_name']) # 'image' +print(dt.attrs['ome_version']) # '0.4' +print(dt.attrs['ome_num_resolutions']) # 3 +print(dt.attrs['ome_multiscale_paths']) # ['0', '1', '2'] + +# Each resolution as a separate DataTree node +print(list(dt.children.keys())) # ['scale0', 'scale1', 'scale2'] +``` + +--- + +### omero + +**OME-NGFF Spec**: [§2.5 OMERO Metadata (Transitional)](https://ngff.openmicroscopy.org/0.5/#omero-md) + +OMERO metadata provides channel information and rendering settings. 
def _zeros_dataset(shape, dims, channel_coords=None):
    """Return a Dataset holding one all-zero variable called ``image``.

    When ``channel_coords`` is given it becomes the ``c`` coordinate of the
    array; otherwise the array is created without coordinates.
    """
    coords = None if channel_coords is None else {"c": channel_coords}
    array = xr.DataArray(np.zeros(shape), dims=dims, coords=coords)
    return xr.Dataset({"image": array})


def _window(end):
    """Return a rendering-window dict spanning 0..65535 that ends at ``end``."""
    return {"min": 0.0, "max": 65535.0, "start": 0.0, "end": end}


class TestMetadataToXarrayAttrs:
    """Checks for the OME-NGFF metadata -> xarray attrs direction."""

    def test_basic_metadata(self) -> None:
        """Name and version are surfaced, and the source dict is kept whole."""
        source = {"name": "test_image", "version": "0.4"}

        result = metadata_to_xarray_attrs(source)

        assert result["ome_name"] == "test_image"
        assert result["ome_version"] == "0.4"
        assert result["ome_ngff_metadata"] == source

    def test_axes_types(self) -> None:
        """Axis types are collected in axis order."""
        kinds = ["channel", "space", "space", "space"]
        source = {"axes": [{"name": n, "type": k} for n, k in zip("czyx", kinds)]}

        result = metadata_to_xarray_attrs(source)

        assert result["ome_axes_types"] == ["channel", "space", "space", "space"]

    def test_axes_units(self) -> None:
        """Only axes that declare a unit show up in the units mapping."""
        spatial = [{"name": n, "type": "space", "unit": "micrometer"} for n in "zyx"]
        source = {"axes": [{"name": "c", "type": "channel"}, *spatial]}

        result = metadata_to_xarray_attrs(source)

        expected = {"z": "micrometer", "y": "micrometer", "x": "micrometer"}
        assert result["ome_axes_units"] == expected

    def test_axes_orientations(self) -> None:
        """Axis orientations are mapped by axis name."""
        expected = {
            "z": "anterior-posterior",
            "y": "left-right",
            "x": "superior-inferior",
        }
        source = {
            "axes": [
                {"name": axis, "type": "space", "orientation": direction}
                for axis, direction in expected.items()
            ],
        }

        result = metadata_to_xarray_attrs(source)

        assert result["ome_axes_orientations"] == expected

    def test_multiscale_info(self) -> None:
        """Dataset paths and their count are both exposed."""
        source = {"datasets": [{"path": level} for level in ("0", "1", "2")]}

        result = metadata_to_xarray_attrs(source)

        assert result["ome_multiscale_paths"] == ["0", "1", "2"]
        assert result["ome_num_resolutions"] == 3

    def test_channel_colors(self) -> None:
        """Channel colors are listed in channel order."""
        palette = {"DAPI": "0000FF", "GFP": "00FF00"}
        source = {
            "omero": {
                "channels": [
                    {"label": label, "color": color} for label, color in palette.items()
                ],
            },
        }

        result = metadata_to_xarray_attrs(source)

        assert result["ome_channel_colors"] == ["0000FF", "00FF00"]

    def test_channel_windows(self) -> None:
        """Rendering windows are listed in channel order."""
        source = {
            "omero": {
                "channels": [
                    {"label": "DAPI", "window": _window(1500.0)},
                    {"label": "GFP", "window": _window(2000.0)},
                ],
            },
        }

        windows = metadata_to_xarray_attrs(source)["ome_channel_windows"]

        assert len(windows) == 2
        assert windows[0]["end"] == 1500.0
        assert windows[1]["end"] == 2000.0

    def test_full_metadata_preserved(self) -> None:
        """The complete source dict, including unknown keys, is retained."""
        source = {
            "name": "test",
            "version": "0.4",
            "custom_field": "should_be_preserved",
            "nested": {"deep": {"field": 123}},
        }

        kept = metadata_to_xarray_attrs(source)["ome_ngff_metadata"]

        assert kept == source
        assert kept["custom_field"] == "should_be_preserved"
        assert kept["nested"]["deep"]["field"] == 123

    def test_channel_labels_not_in_attrs(self) -> None:
        """Channel labels belong in coordinates, so no label attr is emitted."""
        source = {"omero": {"channels": [{"label": "DAPI"}, {"label": "GFP"}]}}

        result = metadata_to_xarray_attrs(source)

        # Neither the new nor the legacy label attribute may appear.
        assert "ome_channels" not in result
        assert "ome_channel_labels" not in result


class TestXarrayToMetadata:
    """Checks for the xarray Dataset -> OME-NGFF metadata direction."""

    def test_basic_reconstruction(self) -> None:
        """Name and version attrs become the matching metadata fields."""
        ds = _zeros_dataset((10, 10), ["y", "x"])
        ds.attrs["ome_name"] = "test_image"
        ds.attrs["ome_version"] = "0.4"

        rebuilt = xarray_to_metadata(ds, preserve_original=False)

        assert rebuilt["name"] == "test_image"
        assert rebuilt["version"] == "0.4"

    def test_axes_reconstruction(self) -> None:
        """Axes are rebuilt from dims plus the type/unit attrs."""
        ds = _zeros_dataset((2, 10, 10), ["c", "y", "x"])
        ds.attrs["ome_axes_types"] = ["channel", "space", "space"]
        ds.attrs["ome_axes_units"] = {"y": "micrometer", "x": "micrometer"}

        axes = xarray_to_metadata(ds, preserve_original=False)["axes"]

        assert len(axes) == 3
        assert axes[0] == {"name": "c", "type": "channel"}
        assert axes[1] == {"name": "y", "type": "space", "unit": "micrometer"}
        assert axes[2] == {"name": "x", "type": "space", "unit": "micrometer"}

    def test_channel_labels_from_coords(self) -> None:
        """String values of the ``c`` coordinate become channel labels."""
        ds = _zeros_dataset((2, 10, 10), ["c", "y", "x"], ["DAPI", "GFP"])
        ds.attrs["ome_channel_colors"] = ["0000FF", "00FF00"]

        rebuilt = xarray_to_metadata(ds, preserve_original=False)

        assert "omero" in rebuilt
        assert "channels" in rebuilt["omero"]
        first, second = rebuilt["omero"]["channels"]  # also pins len == 2
        assert len(rebuilt["omero"]["channels"]) == 2
        assert first["label"] == "DAPI"
        assert first["color"] == "0000FF"
        assert second["label"] == "GFP"
        assert second["color"] == "00FF00"

    def test_preserve_original_metadata(self) -> None:
        """Stored metadata is the base; explicit attrs override its fields."""
        ds = _zeros_dataset((10, 10), ["y", "x"])
        ds.attrs["ome_ngff_metadata"] = {
            "name": "original",
            "version": "0.4",
            "custom_field": "preserved",
            "nested": {"data": 123},
        }
        ds.attrs["ome_name"] = "modified"  # expected to win over "original"

        rebuilt = xarray_to_metadata(ds, preserve_original=True)

        # The explicitly set attr replaces the stored name ...
        assert rebuilt["name"] == "modified"
        # ... while fields the converter does not know about are carried over.
        assert rebuilt["custom_field"] == "preserved"
        assert rebuilt["nested"]["data"] == 123

    def test_channel_windows_reconstruction(self) -> None:
        """Window attrs are attached to the matching channel entries."""
        ds = _zeros_dataset((2, 10, 10), ["c", "y", "x"], ["DAPI", "GFP"])
        ds.attrs["ome_channel_windows"] = [_window(1500.0), _window(2000.0)]

        channels = xarray_to_metadata(ds, preserve_original=False)["omero"]["channels"]

        assert channels[0]["window"]["end"] == 1500.0
        assert channels[1]["window"]["end"] == 2000.0


class TestRoundTrip:
    """Checks that metadata -> attrs -> metadata reproduces the input."""

    def test_simple_roundtrip(self) -> None:
        """Name, version and axes come back unchanged."""
        source = {
            "name": "test_image",
            "version": "0.4",
            "axes": [
                {"name": "c", "type": "channel"},
                {"name": "y", "type": "space", "unit": "micrometer"},
                {"name": "x", "type": "space", "unit": "micrometer"},
            ],
        }

        ds = _zeros_dataset((2, 10, 10), ["c", "y", "x"])
        ds.attrs.update(metadata_to_xarray_attrs(source))

        assert xarray_to_metadata(ds, preserve_original=True) == source

    def test_complex_metadata_roundtrip(self) -> None:
        """Axes, dataset paths and OMERO channel settings all come back unchanged."""
        source = {
            "name": "complex_image",
            "version": "0.4",
            "axes": [
                {"name": "c", "type": "channel"},
                {"name": "z", "type": "space", "unit": "micrometer"},
                {"name": "y", "type": "space", "unit": "micrometer"},
                {"name": "x", "type": "space", "unit": "micrometer"},
            ],
            "datasets": [{"path": "0"}, {"path": "1"}, {"path": "2"}],
            "omero": {
                "channels": [
                    {"label": "DAPI", "color": "0000FF", "window": _window(1500.0)},
                    {"label": "GFP", "color": "00FF00", "window": _window(2000.0)},
                ],
            },
        }

        # Channel labels travel through the ``c`` coordinate, not through attrs.
        ds = _zeros_dataset((2, 5, 10, 10), ["c", "z", "y", "x"], ["DAPI", "GFP"])
        ds.attrs.update(metadata_to_xarray_attrs(source))

        assert xarray_to_metadata(ds, preserve_original=True) == source

    def test_unknown_fields_preserved(self) -> None:
        """Fields the converter does not interpret are carried through."""
        source = {
            "name": "test",
            "version": "0.5",  # newer than the version used by the other tests
            "future_field": "unknown_value",
            "complex_nested": {"level1": {"level2": ["array", "of", "values"]}},
        }

        ds = _zeros_dataset((10, 10), ["y", "x"])
        ds.attrs.update(metadata_to_xarray_attrs(source))

        rebuilt = xarray_to_metadata(ds, preserve_original=True)

        assert rebuilt["future_field"] == "unknown_value"
        assert rebuilt["complex_nested"]["level1"]["level2"] == ["array", "of", "values"]

    def test_numeric_channel_coords_roundtrip(self) -> None:
        """Integer channel coordinates do not introduce OMERO labels."""
        source = {
            "name": "numeric_channels",
            "version": "0.4",
            "axes": [
                {"name": "c", "type": "channel"},
                {"name": "y", "type": "space"},
                {"name": "x", "type": "space"},
            ],
        }

        ds = _zeros_dataset((3, 10, 10), ["c", "y", "x"], [0, 1, 2])
        ds.attrs.update(metadata_to_xarray_attrs(source))

        assert xarray_to_metadata(ds, preserve_original=True) == source
ignore[override] ) -> Dataset: """Open a single resolution level from an OME-Zarr store. + If the zarr store is not OME-NGFF format, falls back to xarray's + native zarr backend. + Parameters ---------- filename_or_obj : str or PathLike @@ -61,6 +66,14 @@ def open_dataset( # type: ignore[override] Dataset containing the requested resolution level. """ path = str(filename_or_obj) if isinstance(filename_or_obj, os.PathLike) else filename_or_obj + + # Check if this is actually an OME-Zarr store + store_type = _detect_store_type(path) + + if store_type == "unknown": + # Not OME-NGFF format - delegate to xarray's zarr backend + return xr.open_dataset(path, engine="zarr", drop_variables=drop_variables) + ds = open_ome_dataset(path, resolution=resolution, validate=validate) if drop_variables is not None: @@ -77,6 +90,9 @@ def open_datatree( # type: ignore[override] ) -> DataTree: """Open an OME-Zarr store as a DataTree with all resolution levels. + If the zarr store is not OME-NGFF format, falls back to xarray's + native zarr backend (returning a DataTree with single node). + Parameters ---------- filename_or_obj : str or PathLike @@ -92,6 +108,16 @@ def open_datatree( # type: ignore[override] DataTree containing all resolution levels. """ path = str(filename_or_obj) if isinstance(filename_or_obj, os.PathLike) else filename_or_obj + + # Check if this is actually an OME-Zarr store + store_type = _detect_store_type(path) + + if store_type == "unknown": + # Not OME-NGFF format - delegate to xarray's zarr backend + # Wrap single dataset in DataTree + ds = xr.open_dataset(path, engine="zarr", drop_variables=drop_variables) + return xr.DataTree(ds) + dt = open_ome_datatree(path, validate=validate) if drop_variables is not None: diff --git a/xarray_ome/metadata.py b/xarray_ome/metadata.py new file mode 100644 index 0000000..47ef94b --- /dev/null +++ b/xarray_ome/metadata.py @@ -0,0 +1,266 @@ +"""Bidirectional conversion between OME-NGFF metadata and xarray structures. 
def metadata_to_xarray_attrs(metadata_dict: dict[str, Any]) -> dict[str, Any]:
    """Convert OME-NGFF metadata to xarray attrs (non-coordinate metadata only).

    Only metadata that is not expressed through dimension names or coordinate
    values is extracted. Axis names and channel labels are intentionally not
    copied into the result.

    Parameters
    ----------
    metadata_dict : dict
        Full OME-NGFF metadata dictionary.

    Returns
    -------
    dict
        Attributes to add to Dataset/DataTree attrs. Possible keys:
        ``ome_name``, ``ome_version``, ``ome_axes_types``, ``ome_axes_units``,
        ``ome_axes_orientations``, ``ome_multiscale_paths``,
        ``ome_num_resolutions``, ``ome_channel_colors``,
        ``ome_channel_windows`` and (always) ``ome_ngff_metadata``.

        ``ome_channel_colors`` and ``ome_channel_windows`` hold one entry per
        channel, with ``None`` for channels that lack the setting, so list
        positions always correspond to channel indices.

    Examples
    --------
    >>> metadata = {
    ...     'name': 'image',
    ...     'version': '0.4',
    ...     'axes': [
    ...         {'name': 'c', 'type': 'channel'},
    ...         {'name': 'z', 'type': 'space', 'unit': 'micrometer'},
    ...     ],
    ...     'omero': {
    ...         'channels': [
    ...             {'label': 'DAPI', 'color': '0000FF'},
    ...         ],
    ...     },
    ... }
    >>> attrs = metadata_to_xarray_attrs(metadata)
    >>> attrs['ome_name']
    'image'
    >>> attrs['ome_axes_types']
    ['channel', 'space']
    """
    attrs: dict[str, Any] = {}

    # Basic metadata
    if "name" in metadata_dict:
        attrs["ome_name"] = metadata_dict["name"]
    if "version" in metadata_dict:
        attrs["ome_version"] = metadata_dict["version"]

    # Axes: names are not copied (they are the dimension names); only the
    # per-axis extras are extracted.
    if "axes" in metadata_dict:
        axes = metadata_dict["axes"]

        # One entry per axis, None where the axis declares no type.
        attrs["ome_axes_types"] = [ax.get("type") for ax in axes]

        # Sparse mappings: only axes that actually declare the field.
        units = {ax["name"]: ax["unit"] for ax in axes if ax.get("unit")}
        if units:
            attrs["ome_axes_units"] = units

        orientations = {
            ax["name"]: ax["orientation"] for ax in axes if ax.get("orientation")
        }
        if orientations:
            attrs["ome_axes_orientations"] = orientations

    # Multiscale dataset information
    if "datasets" in metadata_dict:
        datasets = metadata_dict["datasets"]
        attrs["ome_multiscale_paths"] = [ds["path"] for ds in datasets]
        attrs["ome_num_resolutions"] = len(datasets)

    # OMERO rendering metadata. `or` also covers explicit None values.
    omero = metadata_dict.get("omero") or {}
    channels = omero.get("channels") or []
    if channels:
        colors = [ch.get("color") for ch in channels]
        if any(color is not None for color in colors):
            attrs["ome_channel_colors"] = colors

        # Keep a None placeholder for channels without a window. Filtering
        # them out would shift later windows onto the wrong channel index when
        # the list is written back by xarray_to_metadata().
        windows = [ch.get("window") for ch in channels]
        if any(window is not None for window in windows):
            attrs["ome_channel_windows"] = windows

    # Always keep full metadata for complete round-tripping
    attrs["ome_ngff_metadata"] = metadata_dict

    return attrs


def xarray_to_metadata(
    dataset: xr.Dataset,
    *,
    preserve_original: bool = True,
) -> dict[str, Any]:
    """Convert xarray Dataset back to OME-NGFF metadata dictionary.

    Rebuilds ``name``, ``version``, ``axes`` and ``omero.channels`` from the
    Dataset's attrs, the dimensions of its first data variable and the ``c``
    coordinate.

    Parameters
    ----------
    dataset : xr.Dataset
        Dataset to extract metadata from. Must contain at least one data
        variable; its dimension order defines the axis order.
    preserve_original : bool, default True
        If True and ``'ome_ngff_metadata'`` is in attrs, a deep copy of that
        dict is used as the base and only the reconstructed fields are
        overwritten. If False, the result is built from scratch.

    Returns
    -------
    dict
        OME-NGFF metadata dictionary. It shares no mutable state with
        ``dataset.attrs['ome_ngff_metadata']``.

    Raises
    ------
    ValueError
        If the dataset has no data variables, so no axis order can be derived.

    Notes
    -----
    An ``omero`` section is produced when the ``c`` coordinate holds string
    labels or when ``ome_channel_colors`` / ``ome_channel_windows`` are set.
    Numeric ``c`` coordinates alone do not create one.

    Examples
    --------
    >>> import xarray as xr
    >>> import numpy as np
    >>> ds = xr.Dataset({
    ...     'image': xr.DataArray(
    ...         np.zeros((2, 10, 10)),
    ...         dims=['c', 'y', 'x'],
    ...         coords={'c': ['DAPI', 'GFP']},
    ...     ),
    ... })
    >>> ds.attrs['ome_name'] = 'test'
    >>> ds.attrs['ome_version'] = '0.4'
    >>> metadata = xarray_to_metadata(ds, preserve_original=False)
    >>> metadata['name']
    'test'
    """
    import copy  # function-scope: not part of the module's import block

    attrs = dataset.attrs

    if preserve_original and "ome_ngff_metadata" in attrs:
        # Deep copy: nested entries (omero.channels[]) are edited below and
        # must not alias the dict that stays attached to the dataset.
        metadata: dict[str, Any] = copy.deepcopy(dict(attrs["ome_ngff_metadata"]))
    else:
        metadata = {}

    # Update basic metadata from attrs
    if "ome_name" in attrs:
        metadata["name"] = attrs["ome_name"]
    if "ome_version" in attrs:
        metadata["version"] = attrs["ome_version"]

    # The first data variable defines the axis order.
    first_var = next(iter(dataset.data_vars), None)
    if first_var is None:
        msg = "Cannot reconstruct OME-NGFF axes: the dataset has no data variables."
        raise ValueError(msg)
    metadata["axes"] = _axes_from_dims(dataset[first_var].dims, attrs)

    # Channel information: labels from coordinates, rendering from attrs.
    labels, n_channels = _read_channel_coord(dataset)
    colors = attrs.get("ome_channel_colors")
    windows = attrs.get("ome_channel_windows")

    if labels is not None or colors is not None or windows is not None:
        omero = metadata.get("omero") or {}
        if n_channels is None:
            # No "c" coordinate: size the list from the attrs so their
            # entries are not silently dropped.
            n_channels = max(len(colors or ()), len(windows or ()))
        omero["channels"] = _merge_channels(
            omero.get("channels") or [],
            n_channels,
            labels,
            colors,
            windows,
        )
        metadata["omero"] = omero

    return metadata


def _axes_from_dims(dims: Any, attrs: dict[str, Any]) -> list[dict[str, Any]]:
    """Build the OME-NGFF ``axes`` list for ``dims`` from the ``ome_axes_*`` attrs."""
    axis_types = attrs.get("ome_axes_types") or ()
    units = attrs.get("ome_axes_units") or {}
    orientations = attrs.get("ome_axes_orientations") or {}

    axes = []
    for index, dim in enumerate(dims):
        axis: dict[str, Any] = {"name": str(dim)}
        # Bounds check: a types list shorter than dims must not raise.
        if index < len(axis_types) and axis_types[index] is not None:
            axis["type"] = axis_types[index]
        if dim in units:
            axis["unit"] = units[dim]
        if dim in orientations:
            axis["orientation"] = orientations[dim]
        axes.append(axis)
    return axes


def _read_channel_coord(dataset: Any) -> tuple[list[str] | None, int | None]:
    """Return ``(labels, count)`` for the ``c`` coordinate.

    ``count`` is None when there is no ``c`` coordinate; ``labels`` is None
    unless the coordinate values have a string-like dtype (kind U, S or O).
    """
    if "c" not in dataset.coords:
        return None, None

    values = dataset.coords["c"].values
    count = len(values)
    if values.dtype.kind not in ("U", "S", "O"):
        return None, count

    # bytes values are decoded; str() on them would produce "b'...'".
    labels = [v.decode() if isinstance(v, bytes) else str(v) for v in values]
    return labels, count


def _merge_channels(
    existing: list[dict[str, Any]],
    n_channels: int,
    labels: list[str] | None,
    colors: list[Any] | None,
    windows: list[Any] | None,
) -> list[dict[str, Any]]:
    """Overlay labels/colors/windows onto ``existing`` channel dicts by index.

    The list is padded with empty dicts up to ``n_channels``. ``None``
    entries are skipped, and entries beyond the channel list are ignored.
    """
    channels = list(existing)
    channels.extend({} for _ in range(n_channels - len(channels)))

    for key, values in (("label", labels), ("color", colors), ("window", windows)):
        # zip() stops at the shorter input, i.e. never indexes past channels.
        for channel, value in zip(channels, values or ()):
            if value is not None:
                channel[key] = value
    return channels
+ ) + raise ValueError(msg) + try: multiscales = from_ngff_zarr(str(path), validate=validate) except KeyError as e: - if "multiscales" in str(e): - store_type = _detect_store_type(str(path)) - if store_type == "hcs": - msg = ( - f"The OME-Zarr store at '{path}' appears to be an HCS (High Content " - "Screening) plate structure, which is not yet supported. " - "Currently only simple multiscale images are supported." - ) - raise ValueError(msg) from e - msg = ( - f"The OME-Zarr store at '{path}' does not contain multiscale metadata. " - "It may be an unsupported OME-Zarr structure." - ) - raise ValueError(msg) from e - raise + # Fallback error handling + raise ValueError(f"Failed to parse OME-NGFF metadata from '{path}'") from e # Extract metadata dict for passing to conversion metadata_dict = _metadata_to_dict(multiscales.metadata) @@ -76,8 +82,9 @@ def open_ome_datatree(path: str | Path, validate: bool = False) -> xr.DataTree: # Create the root DataTree with children dt = xr.DataTree(children=children, name="root") - # Store the full OME-NGFF metadata in root attrs - dt.attrs["ome_ngff_metadata"] = metadata_dict + # Add OME-NGFF metadata as attrs (coordinate-based metadata is in coords) + metadata_attrs = metadata_to_xarray_attrs(metadata_dict) + dt.attrs.update(metadata_attrs) return dt @@ -188,22 +195,9 @@ def _ngff_image_to_dataset( dataset = xr.Dataset({ngff_image.name: data_array}) # Store scale and translation in attrs for round-tripping + # These are needed for coords_to_transforms() to work efficiently dataset.attrs["ome_scale"] = ngff_image.scale dataset.attrs["ome_translation"] = ngff_image.translation - if ngff_image.axes_units: - dataset.attrs["ome_axes_units"] = dict(ngff_image.axes_units) - if ngff_image.axes_orientations: - dataset.attrs["ome_axes_orientations"] = { - k: str(v) for k, v in ngff_image.axes_orientations.items() - } - - # Store image name from OME metadata if available - if metadata and "name" in metadata: - 
dataset.attrs["ome_image_name"] = metadata["name"] - - # Store channel information for reference - if channel_labels: - dataset.attrs["ome_channel_labels"] = channel_labels return dataset @@ -249,24 +243,29 @@ def open_ome_dataset(path: str | Path, resolution: int = 0, validate: bool = Fal Currently only supports simple multiscale images. HCS (High Content Screening) plate structures are not yet supported. """ + # First check if this is actually an OME-Zarr store + store_type = _detect_store_type(str(path)) + + if store_type == "unknown": + msg = ( + f"The zarr store at '{path}' does not appear to be OME-NGFF format. " + "It may be a regular zarr file. Try opening with engine='zarr' instead." + ) + raise ValueError(msg) + + if store_type == "hcs": + msg = ( + f"The OME-Zarr store at '{path}' appears to be an HCS (High Content " + "Screening) plate structure, which is not yet supported. " + "Currently only simple multiscale images are supported." + ) + raise ValueError(msg) + try: multiscales = from_ngff_zarr(str(path), validate=validate) except KeyError as e: - if "multiscales" in str(e): - store_type = _detect_store_type(str(path)) - if store_type == "hcs": - msg = ( - f"The OME-Zarr store at '{path}' appears to be an HCS (High Content " - "Screening) plate structure, which is not yet supported. " - "Currently only simple multiscale images are supported." - ) - raise ValueError(msg) from e - msg = ( - f"The OME-Zarr store at '{path}' does not contain multiscale metadata. " - "It may be an unsupported OME-Zarr structure." 
- ) - raise ValueError(msg) from e - raise + # Fallback error handling + raise ValueError(f"Failed to parse OME-NGFF metadata from '{path}'") from e # Extract metadata dict for passing to conversion metadata_dict = _metadata_to_dict(multiscales.metadata) @@ -287,6 +286,9 @@ def open_ome_dataset(path: str | Path, resolution: int = 0, validate: bool = Fal # Store metadata about resolution level dataset.attrs["ome_ngff_resolution"] = resolution - dataset.attrs["ome_ngff_metadata"] = metadata_dict + + # Add OME-NGFF metadata as attrs (coordinate-based metadata is in coords) + metadata_attrs = metadata_to_xarray_attrs(metadata_dict) + dataset.attrs.update(metadata_attrs) return dataset