Concatenation of virtual datasets fails due to missing Chunk Manager #382

@observingClouds

Description

What I did
I created kerchunk reference files following #381 and tried to combine them via xr.merge.

import xarray as xr
import tempfile
import json
from pathlib import Path
import virtualizarr as vz
import os

# Create two example xarray datasets
ds1 = xr.Dataset(
    {
        "a": (("x", "y"), [[1, 2], [3, 4]]),
        "b": (("x", "y"), [[10, 20], [30, 40]]),
    },
    coords={"x": [10, 20], "y": [1, 2]},
)

ds2 = xr.Dataset(
    {
        "c": (("x", "y"), [[5, 6], [7, 8]]),
        "d": (("x", "y"), [[50, 60], [70, 80]]),
    },
    coords={"x": [1, 2], "y": [1, 2]},
)

ref1 = ds1.virtualize.to_kerchunk()
ref2 = ds2.virtualize.to_kerchunk()

## Opening inlined references currently raises:
## "NotImplementedError: Outlining all references as inlined data is not yet supported. [ToDo]"
## so the inlined data has to be written out ("outlined") to files first.
# mkdtemp() creates directories that persist; TemporaryDirectory().name is
# fragile because the directory is removed once the object is garbage-collected
tempdir1 = Path(tempfile.mkdtemp())
tempdir2 = Path(tempfile.mkdtemp())
def outline_references(ref: dict, folder: Path) -> dict:
    """
    Virtualizarr currently does not support inlined references.
    To open the references with virtualizarr, each inlined value is
    written ("outlined") to a file on disk and replaced by a
    [path, offset, length] entry. The .zarray, .zattrs and .zgroup
    metadata keys are kept inline.
    """
    refs = ref["refs"]
    for k, v in refs.items():
        if os.path.basename(k).startswith("."):
            continue  # keep zarr metadata (.zarray, .zattrs, .zgroup) inline
        elif isinstance(v, str):
            file = folder / k
            os.makedirs(os.path.dirname(file), exist_ok=True)
            with open(file, "w") as f:
                f.write(v)
            # kerchunk expects [path, offset, length] with the length in
            # bytes; str.__sizeof__() would return the in-memory object size
            refs[k] = [str(file), 0, len(v.encode())]
    return ref

ref1 = outline_references(ref1, tempdir1)
ref2 = outline_references(ref2, tempdir2)


## Write the references to disk (open_virtual_dataset expects a file path)
with open("ref1.json", "w") as f:
    json.dump(ref1, f)

with open("ref2.json", "w") as f:
    json.dump(ref2, f)

# Note: this section requires the modification from #381
vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
vds2 = vz.open_virtual_dataset("ref2.json", filetype='kerchunk')

xr.merge([vds1, vds2])

What happened

TypeError                                 Traceback (most recent call last)
Cell In[20], line 66
     63 vds1 = vz.open_virtual_dataset("ref1.json", filetype='kerchunk')
     64 vds2 = vz.open_virtual_dataset("ref2.json", filetype='kerchunk')
---> 66 xr.merge([vds1, vds2])

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:976, in merge(objects, compat, join, fill_value, combine_attrs)
    973         obj = obj.to_dataset()
    974     dict_like_objects.append(obj)
--> 976 merge_result = merge_core(
    977     dict_like_objects,
    978     compat,
    979     join,
    980     combine_attrs=combine_attrs,
    981     fill_value=fill_value,
    982 )
    983 return Dataset._construct_direct(**merge_result._asdict())

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:701, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value, skip_align_args)
    699 collected = collect_variables_and_indexes(aligned, indexes=indexes)
    700 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
--> 701 variables, out_indexes = merge_collected(
    702     collected, prioritized, compat=compat, combine_attrs=combine_attrs
    703 )
    705 dims = calculate_dimensions(variables)
    707 coord_names, noncoord_names = determine_coords(coerced)

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:290, in merge_collected(grouped, prioritized, compat, combine_attrs, equals)
    288 variables = [variable for variable, _ in elements_list]
    289 try:
--> 290     merged_vars[name] = unique_variable(
    291         name, variables, compat, equals.get(name, None)
    292     )
    293 except MergeError:
    294     if compat != "minimal":
    295         # we need more than "minimal" compatibility (for which
    296         # we drop conflicting coordinates)

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/merge.py:137, in unique_variable(name, variables, compat, equals)
    133         break
    135 if equals is None:
    136     # now compare values with minimum number of computes
--> 137     out = out.compute()
    138     for var in variables[1:]:
    139         equals = getattr(out, compat)(var)

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/variable.py:1026, in Variable.compute(self, **kwargs)
   1008 """Manually trigger loading of this variable's data from disk or a
   1009 remote source into memory and return a new variable. The original is
   1010 left unaltered.
   (...)
   1023 dask.array.compute
   1024 """
   1025 new = self.copy(deep=False)
-> 1026 return new.load(**kwargs)

File ~/virtualizarr/lib/python3.10/site-packages/xarray/core/variable.py:1004, in Variable.load(self, **kwargs)
    987 def load(self, **kwargs):
    988     """Manually trigger loading of this variable's data from disk or a
    989     remote source into memory and return this variable.
    990 
   (...)
   1002     dask.array.compute
   1003     """
-> 1004     self._data = to_duck_array(self._data, **kwargs)
   1005     return self

File ~/virtualizarr/lib/python3.10/site-packages/xarray/namedarray/pycompat.py:129, in to_duck_array(data, **kwargs)
    126 from xarray.namedarray.parallelcompat import get_chunked_array_type
    128 if is_chunked_array(data):
--> 129     chunkmanager = get_chunked_array_type(data)
    130     loaded_data, *_ = chunkmanager.compute(data, **kwargs)  # type: ignore[var-annotated]
    131     return loaded_data

File ~/virtualizarr/lib/python3.10/site-packages/xarray/namedarray/parallelcompat.py:158, in get_chunked_array_type(*args)
    152 selected = [
    153     chunkmanager
    154     for chunkmanager in chunkmanagers.values()
    155     if chunkmanager.is_chunked_array(chunked_arr)
    156 ]
    157 if not selected:
--> 158     raise TypeError(
    159         f"Could not find a Chunk Manager which recognises type {type(chunked_arr)}"
    160     )
    161 elif len(selected) >= 2:
    162     raise TypeError(f"Multiple ChunkManagers recognise type {type(chunked_arr)}")

TypeError: Could not find a Chunk Manager which recognises type <class 'virtualizarr.manifests.array.ManifestArray'>
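
The compute is triggered because coordinates appear in both datasets: xarray's unique_variable() calls .compute() to check that the duplicated variables are equal, and it treats the backing ManifestArray as a chunked array for which no ChunkManager is registered.

As a diagnostic only (a sketch, not a supported pattern), compat="override" makes merge keep the first copy of each duplicated variable instead of comparing values, so the compute is never reached:

# Diagnostic sketch: compat="override" skips the equality check that
# triggers .compute(). Note it silently keeps vds1's coordinates and does
# NOT produce the outer-joined result shown below.
merged = xr.merge([vds1, vds2], compat="override")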

What I expected
I expected the merge of the virtual datasets to return the virtual equivalent of merging the "real" datasets:

>>> xr.merge([ds1, ds2])
<xarray.Dataset> Size: 304B
Dimensions:  (x: 4, y: 2)
Coordinates:
  * x        (x) int64 32B 1 2 10 20
  * y        (y) int64 16B 1 2
Data variables:
    a        (x, y) float64 64B nan nan nan nan 1.0 2.0 3.0 4.0
    b        (x, y) float64 64B nan nan nan nan 10.0 20.0 30.0 40.0
    c        (x, y) float64 64B 5.0 6.0 7.0 8.0 nan nan nan nan
    d        (x, y) float64 64B 50.0 60.0 70.0 80.0 nan nan nan nan
Attributes:
    coordinates:  x y
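
For completeness: the pattern virtualizarr's documentation recommends for concatenating virtual datasets avoids this problem by telling xarray not to compare or load anything. A sketch for the usual case of datasets that share variables along a dimension (e.g. consecutive time steps; vds_t0 and vds_t1 are hypothetical virtual datasets opened as above):

# Sketch following the combine options suggested in virtualizarr's docs:
# coords="minimal" and compat="override" keep xarray from comparing or
# loading chunk data, so no ChunkManager is needed.
combined = xr.concat(
    [vds_t0, vds_t1],
    dim="time",
    coords="minimal",
    compat="override",
    combine_attrs="override",
)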

Environment
virtualizarr version: 1.2.1.dev19+g0d2d6ab
